home *** CD-ROM | disk | FTP | other *** search
- Attribute VB_Name = "Functions_HTML"
- Option Explicit
-
- Public Type HTMLTag
- Full As String
- Name As String
- Value As String
- End Type
-
- Public Sub HTMLParse(ByVal urlText As String, ByVal baseURL As String, tagArray() As HTMLTag)
-
- Dim i As Integer ' Keeps track of the size of our array
-
- Dim openTag As Integer ' Holds the position of "<"
- Dim closeTag As Integer ' Holds the position of ">"
-
- Dim curTag As HTMLTag ' Holds the values of the current Tag being looked at
-
- Dim endName As Integer ' Holds the position of the end of the tag name
-
- Dim startValue As Integer ' Holds the postion of the beginning of the tag value
- Dim endValue As Integer ' Holds the position of the end of the tag value
-
- ' Make sure that baseURL ends with a forward slash.
- ' If not, append it.
- If Right(baseURL, 1) <> "/" Then baseURL = baseURL & "/"
-
- ' Get the postion of the first open bracket
- openTag = InStr(1, urlText, "<")
-
- ' Resize our array
- ReDim Preserve tagArray(i)
-
- ' We need to loop through the string until
- ' there are no more tags left (i.e. no more
- ' opening brackets are found)
- Do
- DoEvents
-
- ' Only look for the closing bracket if the
- ' opening bracket was found. Otherwise
- ' we will get an error when trying to get
- ' use mid to get the tag
- If openTag <> 0 Then
- closeTag = InStr(openTag + 1, urlText, ">")
- Else
- Exit Do
- End If
-
- ' If there is no closing bracket (">") for the current tag
- ' then there was an HTML coding error and we will
- ' get stuck in an endless loop, so we set the closing
- ' tag to opentag + 1 and jump to the next iteration
- ' of the loop
- If closeTag = 0 Then closeTag = openTag + 1: GoTo NextIteration
-
- ' .Full is everything between the opening and closing brackets,
- ' including the brackets themselves
- curTag.Full = LCase(Mid(urlText, openTag, closeTag - openTag + 1))
-
- ' Get the tag name and assign it to .Name.
- ' This is done by getting everything after the opening
- ' brack until the first space or the closing bracket.
- endName = InStr(2, curTag.Full, " ") - 2
- If endName = -2 Then endName = InStr(2, curTag.Full, ">") - 2
- curTag.Name = TrimNull(Mid(curTag.Full, 2, endName))
-
- ' Depending on what type of tag we have,
- ' get and assign the value of that tag.
- ' Examples for <A> and <IMG> are given
- ' here. Any other values you need can be
- ' added as necessary using this example.
- Select Case (curTag.Name)
-
- ' If the current tag is an <A> tag then get the
- ' file in href="xxx" and assign it to .value
- Case "a"
-
- ' Position of "href=" in the tag
- startValue = InStr(LCase(curTag.Full), "href=")
-
- ' If its there (not all anchor tags are links, so we
- ' have to be careful), then proceed. Otherwise,
- ' set .Value to null.
- If (startValue <> 0) Then
-
- ' Again, find the end of the value by getting the
- ' location of the next space or a closing bracket.
- endValue = InStr(startValue + 5, curTag.Full, " ") - 1
- If endValue = -1 Then endValue = InStr(startValue + 5, curTag.Full, ">") - 1
-
- ' Only if both these values are not zero, procced
- If ((startValue <> 0) And (endValue <> 0)) Then
-
- ' Strip away quotation marks, and null characters, and set .Value
- curTag.Value = StripQuotes(TrimNull(Mid(curTag.Full, startValue + 5, endValue - startValue - 4)))
-
- ' Do some last minute check here. Some URL's are not
- ' complete URL's in that they only contain the logical
- ' path to the next URL from the current one. If that is the case
- ' we have to append the base URL to the start of the URL.
- If (Left(LCase(curTag.Value), 7) <> "http://") Then
- If (Left(curTag.Value, 1) = "/") Then
- curTag.Value = baseURL & Mid(curTag.Value, 2)
- Else
- curTag.Value = baseURL & curTag.Value
- End If ' (Left(curTag.Value, 1) = "/")
- End If ' (Left(LCase(curTag.Value), 7) <> "http://")
- End If ' ((startValue <> 0) And (endValue <> 0))
-
- Else
- curTag.Value = ""
-
- End If ' (startValue <> 0)
-
- ' If the current tag is an <IMG> tag then get the
- ' file in src="xxx" and assign it to .value
- Case "img"
-
- ' Refer to the above case for commentary
- ' on what is going on below
- startValue = InStr(LCase(curTag.Full), "src=")
- endValue = InStr(startValue + 5, curTag.Full, " ") - 1
- If endValue = -1 Then endValue = InStr(startValue + 5, curTag.Full, ">") - 1
- If startValue <> 0 And endValue <> 0 Then
- curTag.Value = StripQuotes(TrimNull(Mid(curTag.Full, startValue + 5, endValue - startValue - 4)))
- If Left(LCase(curTag.Value), 7) <> "http://" Then
- If Left(curTag.Value, 1) = "/" Then
- curTag.Value = baseURL & Mid(curTag.Value, 2)
- Else
- curTag.Value = baseURL & curTag.Value
- End If
- End If
- End If
-
- ' If we are looking at a tag that doesn't have a
- ' value associated with it, or we are not interested
- ' in its value, so set the value to null
- Case Else
- curTag.Value = ""
-
- End Select ' (curTag.Name)
-
- ' Set the current item of the tagArray to curTag
- tagArray(i) = curTag
-
- ' Increment our tag counter
- ' and change the size of the tagArray
- i = i + 1
- ReDim Preserve tagArray(i)
-
- NextIteration:
- ' Find the position of the next opening bracket in the code
- openTag = InStr(closeTag, urlText, "<")
- Loop Until openTag = 0
-
- End Sub
-
-
-